St. Jude OS data CNV analysis

Goals:

We want to characterize the CNV data from St. Jude OS samples in order to determine if the CNVs are consistent across samples within a patient. This will tell us if these tumors are genomically stable or if there are ongoing rearrangements. We also want to determine if there are sub-clones within a single patient which would be revealed by distinct CNV patterns within a patient.

Samples:

For this analysis we got access to many St. Jude OS samples. We also included our single-cell sequencing data which we treated as bulk for the purposes of these analyses.

Workflow

Load libraries

library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.4     ✔ purrr   0.3.4
## ✔ tibble  3.1.2     ✔ dplyr   1.0.7
## ✔ tidyr   1.1.3     ✔ stringr 1.4.0
## ✔ readr   2.0.1     ✔ forcats 0.5.1
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
theme_set(theme_bw())
theme_update(plot.title = element_text(hjust = 0.5))
library(gt)

Make up directory structure

for directoryName in \
  misc \
  slurmOut \
  input\split \
  output \
  output/aligned \
  output/figures \
  output/counts \
  output/varscan/copynumber/ \
  output/vcfs \
  output/vcfs/realignSummary \
  output/vcfs/p53Char \
  output/vcfs/p53Char/clinVar
do
  if [ ! -d ${directoryName} ]
  then
    mkdir -p ${directoryName}
  fi 
done

Download split files from DNA nexus

I had to run these one by one manually due to how the data are organized

sbatch sbatchCmds/forceDownload_1.sh
sbatch sbatchCmds/forceDownload_2.sh
sbatch sbatchCmds/forceDownload_3.sh
sbatch sbatchCmds/forceDownload_4.sh
sbatch sbatchCmds/forceDownload_5.sh
sbatch sbatchCmds/forceDownload_6.sh

Downloaded the md5 data for the raw bam files

cat misc/rawMd5s/rawBamMd5_md5sum_job-G5FQ* > misc/rawBamMd5s.txt

Recombine the split files and check md5sums

sbatch sbatchCmds/catSplitFiles.sh

Make up a list of all chromosomes in the hg38 genome

Kicked out the unknown and random chromosomes

cat /reference/homo_sapiens/hg38/ucsc_assembly/illumina_download/Sequence/WholeGenomeFasta/genome.fa.fai | \
  grep -v "chrUn\|random\|EBV" \
    > misc/chrList.txt

Realign the bam files using a mixed human/mouse reference

if [ ! -d /gpfs0/scratch/mvc002/roberts/stjude/realigned ]
then
    mkdir -p /gpfs0/scratch/mvc002/roberts/stjude/realigned
fi

sbatch sbatchCmds/realign.sh

Call CNVs using varscan

sbatch sbatchCmds/mpileupVarscan.sh

Refine CNV calls using varscan copycaller

sbatch sbatchCmds/varScanCopycaller.sh

Plot the distribution of CNVs found by copynumber

pdf(file = "output/figures/CNVdistribution_copynumber.pdf")
for (sample_name in list.dirs(path = "output/varscan/copynumber",
                              full.names = FALSE,
                              recursive = FALSE)) {
    print(read_tsv(list.files(path = paste("output/varscan/copynumber/",
                                           sample_name,
                                           sep = ""),
                             pattern = "*.copynumber",
                             full.names = TRUE),
                   show_col_types = FALSE) %>%
        sample_n(1000000) %>%
        select(log2_ratio) %>%
        mutate(ratio = 2^log2_ratio) %>%
        ggplot(aes(x = ratio)) +
        geom_histogram(bins = 300) +
        geom_vline(xintercept = 1) +
        scale_x_log10() +
        geom_vline(xintercept = 2) +
        geom_vline(xintercept = 0.5) +
        ggtitle(sample_name))
}
dev.off()
## png 
##   2

Calculate the distance between samples based on CNVs

This approach has the issue that normalization isn’t perfect across the samples due to the extensive number of genomic changes. This artificially increases the distance between similar samples. This likely will not be used in the paper.

perl scripts/varscanToMatrix.pl \
    -v \
    --fileList "output/varscan/copycaller/S*/*chr*.txt" \
        > output/varscan/varscan_distMatrix.txt

perl scripts/varscanToMatrix_threshhold.pl \
    -v \
    --fileList "output/varscan/copycaller/S011*/*chr*.txt" \
        > output/varscan/varscan_distMatrix_threshhold.txt

Combine the varscan outputs into one file

sbatch sbatchCmds/combineVarscan.sh

Make plots of the CNV calls across all the samples

large_bin_size <- 100000

combined_data <- read_tsv("output/varscan/combined_calls.txt",
                          show_col_types = FALSE) %>%
    mutate(chr_num = str_remove(chr, "chr") %>%
                as.numeric()) %>%
    arrange(chr_num, bin) %>%
    mutate(order = seq_len(nrow(.)),
           larger_bin = floor(bin / large_bin_size) * large_bin_size) %>%
    pivot_longer(cols = c(-chr, -bin, -order, -chr_num, -larger_bin),
                 names_to = "sample",
                 values_to = "log_ratio")

patient_key <- data.frame(
    patient = unique(combined_data$sample) %>%
        str_remove("_.+") %>%
        str_replace("S0126", "SJOS030645") %>%
        str_replace("S0127", "SJOS030645") %>%
        str_replace("S0128", "SJOS030645") %>%
        str_replace("S0114", "SJOS031478") %>%
        str_replace("S0116", "SJOS031478") %>%
        str_replace("S0113", "SJOS046149") %>%
        str_replace("S0115", "SJOS046149"),
    sample_type = unique(combined_data$sample) %>%
        str_replace("^S0.+", "Xenograft") %>%
        str_replace(".+_X.+", "Xenograft") %>%
        str_replace(".+_D.+", "Diagnosis") %>%
        str_replace(".+_M.+", "Metastasis") %>%
        str_replace(".+_R.+", "Relapse"),
    sample = unique(combined_data$sample))

combined_data <- full_join(combined_data, patient_key)
## Joining, by = "sample"
chr_cols <- sample(rainbow(length(unique(combined_data$chr))),
                   length(unique(combined_data$chr)),
                   replace = FALSE)

for (patient_name in unique(patient_key$patient)) {
    plot_name <- combined_data %>%
        filter(patient == patient_name) %>%
        group_by(chr, sample, larger_bin) %>%
        summarize(log_ratio = median(log_ratio),
                order = min(order),
                .groups = "drop") %>%
        ggplot(aes(x = order,
                    y = log_ratio,
                    color = chr)) +
        geom_hline(yintercept = 0, color = "black") +
        geom_point(alpha = 0.5,
                    size = 0.5) +
        scale_color_manual(values = chr_cols) +
        labs(y = "Copy number log ratio",
             x = "") +
        facet_wrap(~ sample, ncol = 1)

    ggsave(paste0("output/figures/varscan",
                 patient_name,
                 ".png"),
           plot = plot_name,
           width = 20,
           height = 8)
}

## Take a look at regions commonly amplified/deleted across samples
unique_clones <- c("SJOS001101_M4",
                   "SJOS001111_M1",
                   "SJOS001116_X2",
                   "SJOS001121_X2",
                   "SJOS001126_X1",
                   "SJOS013768_X1",
                   "SJOS016016_X1",
                   "SJOS030101_R1",
                   "SJOS030645_D2",
                   "SJOS031478_D3",
                   "SJOS046149_R2",
                   "SJOS001105_R1")

combined_data %>%
    filter(sample %in% unique_clones) %>%
    group_by(chr, bin) %>%
    summarize(mean_ratio = median(log_ratio),
              order = min(order),
              .groups = "drop") %>%
    ggplot(aes(x = bin,
               y = mean_ratio,
               color = chr)) +
    geom_hline(yintercept = 0, color = "black") +
    geom_point(alpha = 0.5,
               size = 0.5) +
    labs(x = "",
         y = "Mean copy number ratio") +
    facet_wrap(~ chr, scales = "free_x") +
    theme(legend.position = "none")

ggsave("output/figures/clonal_mean_CNV.png",
       width = 15,
       height = 10)

high_chr3 <- combined_data %>%
    filter(sample %in% unique_clones) %>%
    group_by(chr, bin) %>%
    summarize(mean_ratio = median(log_ratio),
              order = min(order),
              .groups = "drop") %>%
    filter(chr == "chr3" & mean_ratio < -0.5)

Look at correlation of CNVs between samples

cnv_cor <- read_tsv("output/varscan/combined_calls.txt",
                     col_names = TRUE,
                     show_col_types = FALSE) %>%
    mutate(chr_bin = paste(chr, bin, sep = "_"), .keep = "unused") %>%
    column_to_rownames("chr_bin") %>%
    cor()

patient_key <- data.frame(patient = rownames(cnv_cor) %>%
                                str_remove("_.+") %>%
                                str_replace("S0126", "SJOS030645") %>%
                                str_replace("S0127", "SJOS030645") %>%
                                str_replace("S0128", "SJOS030645") %>%
                                str_replace("S0114", "SJOS031478") %>%
                                str_replace("S0116", "SJOS031478") %>%
                                str_replace("S0113", "SJOS046149") %>%
                                str_replace("S0115", "SJOS046149"),
                          sample_type = rownames(cnv_cor) %>%
                                str_replace("^S0.+", "Xenograft") %>%
                                str_replace(".+_X.+", "Xenograft") %>%
                                str_replace(".+_D.+", "Diagnosis") %>%
                                str_replace(".+_M.+", "Metastasis") %>%
                                str_replace(".+_R.+", "Relapse"),
                          row.names = rownames(cnv_cor))

col_patient <- list(patient = c("#a6cee3",
                                "#1f78b4",
                                "#b2df8a",
                                "#33a02c",
                                "#fb9a99",
                                "#e31a1c",
                                "#fdbf6f",
                                "#ff7f00",
                                "#cab2d6",
                                "#6a3d9a",
                                "#ffff99",
                                "#b15928"),
                    sample_type = c("#a6cee3",
                                    "#1f78b4",
                                    "#b2df8a",
                                    "#33a02c"))

names(col_patient$patient) <- patient_key$patient %>%
    as.factor() %>%
    levels()

names(col_patient$sample_type) <- patient_key$sample_type %>%
    as_factor() %>%
    levels()

png("output/figures/cnv_correlation_heatmap.png",
    width = 3500,
    height = 3000,
    res = 300)
pheatmap::pheatmap(cnv_cor,
                   annotation_row = patient_key,
                   annotation_col = patient_key,
                   annotation_colors = col_patient)
dev.off()
## png 
##   3

SNP calling

Call snps using bcftools mpileup and filter SNPs

Filter out indels and SNPs with qual < 20 and depth < 20

# Make up a list of chromosomes to analyze for use by parallel
grep "^>" \
    /reference/homo_sapiens/hg38/ucsc_assembly/illumina_download/Sequence/WholeGenomeFasta/genome.fa \
    | grep -v "random\|chrUn\|chrEBV" \
    | perl -pe 's/>//' \
        > misc/chrList.txt

sbatch sbatchCmds/callVariants.sh
sbatch sbatchCmds/indexVcfs.sh

Calculate distance between samples based on SNP calls

module load GCC/9.3.0 \
            GCCcore/9.3.0 \
            BCFtools/1.11 \
            SAMtools/1.15

perl scripts/vcfToMatrix_parallel.pl \
    --fileList "output/vcfs/S*vcf.gz" \
    --threads 20 \
        > output/vcfs/snpDistanceMatrix.txt

Check P53 for mutations using SnpEff

sbatch sbatchCmds/TP53MutChar.sh

wget https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar_20220416.vcf.gz
mv clinvar_20220416.vcf.gz misc/
zcat misc/clinvar_20220416.vcf.gz \
    | awk '$1 ~ /^#/ || \
           $1 == 17 && \
           $2 >= 7600000 && \
           $2 <= 7690000 \
           {print}' \
    | perl -pe 's/^17/chr17/' \
    | gzip \
    > misc/clinvar_TP53.vcf.gz

Merge clinvar data with TP53Muts.txt

combineClinvar.sh
filter_list <- function(x) {
    str_split(x, ",") %>%
    unlist() %>%
    str_subset("TP53") %>%
    str_subset("downstream_gene_variant",
               negate = TRUE) %>%
    str_subset("upstream_gene_variant",
               negate = TRUE) %>%
    str_subset("intergenic_region",
               negate = TRUE) %>%
    str_subset("intron_variant",
               negate = TRUE) %>%
    str_subset("[35]_prime_UTR_variant",
               negate = TRUE) %>%
    str_c(collapse = ",")
}

file_list <- list.files("output/vcfs/p53Char/clinVar",
                        pattern = ".+_G.+",
                        full.names = TRUE)

combined_data <- tibble(sample = character())

for (file_name in file_list) {
    combined_data <-
        read_tsv(file_name,
                 col_names = TRUE,
                 col_types = cols(.default = "c")) %>%
        filter(grepl("TP53", ANN)) %>%
        rowwise() %>%
        mutate(ANN = filter_list(ANN),
               sample = str_match(file_name, ".+/(.+).txt")[2]) %>%
        filter(ANN != "") %>%
        bind_rows(combined_data) %>%
        relocate(sample, CLNSIG, ANN)
}

write_tsv(combined_data,
          file = "output/vcfs/p53Char/p53Char_combined_germline.txt")
perl scripts/cmpVarscanCalls.pl \
    --binSize 100 \
    --inputFiles "output/varscan/copycaller/*/*chr17.txt" \
        > output/varscan/combined_calls_chr17.txt
## 
## Printing out results

Plot P53 CNV data

P53_loc <- list(min = 7661779,
                max = 7687538)

combined_data <- read_tsv("output/varscan/combined_calls_chr17.txt",
                          show_col_types = FALSE) %>%
    mutate(chr_num = str_remove(chr, "chr") %>%
                as.numeric()) %>%
    filter(chr_num == 17 & bin >= 7550000 & bin <= 7800000) %>%
    arrange(chr_num, bin) %>%
    pivot_longer(cols = c(-chr, -bin, -chr_num),
                 names_to = "sample",
                 values_to = "log_ratio")

patient_key <- data.frame(
    patient = unique(combined_data$sample) %>%
        str_remove("_.+") %>%
        str_replace("S0126", "SJOS030645") %>%
        str_replace("S0127", "SJOS030645") %>%
        str_replace("S0128", "SJOS030645") %>%
        str_replace("S0114", "SJOS031478") %>%
        str_replace("S0116", "SJOS031478") %>%
        str_replace("S0113", "SJOS046149") %>%
        str_replace("S0115", "SJOS046149"),
    sample_type = unique(combined_data$sample) %>%
        str_replace("^S0.+", "Xenograft") %>%
        str_replace(".+_X.+", "Xenograft") %>%
        str_replace(".+_D.+", "Diagnosis") %>%
        str_replace(".+_M.+", "Metastasis") %>%
        str_replace(".+_R.+", "Relapse"),
    sample = unique(combined_data$sample))

combined_data <- full_join(combined_data, patient_key) %>%
    mutate(amplification = if_else(log_ratio > 0.5,
                                   "amplification",
                                   if_else(log_ratio < -0.5,
                                           "deletion",
                                           "normal")))
## Joining, by = "sample"
ggplot(combined_data,
       aes(x = bin,
           y  = sample,
           fill = log_ratio)) +
    geom_tile() +
    scale_fill_gradient2(low = "#8e0152",
                         mid = "white",
                         high = "#276419") +
    facet_grid(patient ~ .,
               scales = "free_y",
               space = "free") +
    theme(strip.text.y.right = element_text(angle = 0)) +
    geom_vline(xintercept = P53_loc$min,
               linetype = "dashed",
               colour = "red") +
    geom_vline(xintercept = P53_loc$max,
               linetype = "dashed",
               colour = "red") +
    labs(x = "Chromosome 17 position",
         y = "",
         title = "TP53 copy numbers")

ggsave(file = "output/figures/p53Cnv_chr17.png",
       width = 10,
       height = 10)
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Red Hat Enterprise Linux 8.1 (Ootpa)
## 
## Matrix products: default
## BLAS:   /gpfs0/export/apps/opt/R/4.1.0-foss-2020a/lib64/R/lib/libRblas.so
## LAPACK: /gpfs0/export/apps/opt/R/4.1.0-foss-2020a/lib64/R/lib/libRlapack.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gt_0.3.0        forcats_0.5.1   stringr_1.4.0   dplyr_1.0.7    
##  [5] purrr_0.3.4     readr_2.0.1     tidyr_1.1.3     tibble_3.1.2   
##  [9] ggplot2_3.3.4   tidyverse_1.3.1 rmarkdown_2.9  
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.1.1  xfun_0.24         bslib_0.2.5.1     haven_2.4.1      
##  [5] colorspace_2.0-1  vctrs_0.3.8       generics_0.1.0    htmltools_0.5.1.1
##  [9] yaml_2.2.1        utf8_1.2.1        rlang_0.4.11      jquerylib_0.1.4  
## [13] pillar_1.6.1      glue_1.4.2        withr_2.4.2       DBI_1.1.1        
## [17] dbplyr_2.1.1      modelr_0.1.8      readxl_1.3.1      lifecycle_1.0.0  
## [21] munsell_0.5.0     gtable_0.3.0      cellranger_1.1.0  rvest_1.0.0      
## [25] evaluate_0.14     knitr_1.33        tzdb_0.1.2        ps_1.6.0         
## [29] fansi_0.5.0       broom_0.7.7       Rcpp_1.0.7        backports_1.2.1  
## [33] scales_1.1.1      jsonlite_1.7.2    fs_1.5.0          hms_1.1.0        
## [37] digest_0.6.27     stringi_1.6.2     grid_4.1.0        cli_2.5.0        
## [41] tools_4.1.0       magrittr_2.0.1    sass_0.4.0        crayon_1.4.1     
## [45] pkgconfig_2.0.3   ellipsis_0.3.2    praise_1.0.0      xml2_1.3.2       
## [49] reprex_2.0.0      lubridate_1.7.10  rstudioapi_0.13   assertthat_0.2.1 
## [53] httr_1.4.2        R6_2.5.0          compiler_4.1.0